/* Dataset description:
Dataset: BeH V10.07-202112
Years: 1997 - 2019

Preparation by DIM before delivery: 
1) drop spells with beh_tag_entg (daily wage) == 0 or missing
2) only keep spells with beh_pers_gr (person group) == 101, 102, 140, 141 or 143
   ==> therefore, inter alia, marginal part-time spells are dropped
   
Variables in dataset:
prs_id, beh_beg_dat, beh_end_dat, betnr, berufstellg, berufstellg_imp_num, beh_pers_gr, beh_abg, 
sex_id, beh_tag_entg, geb_dat_year, geb_dat_month, beh_ausbildung_imp, beh_ausbildung, beh_beruf_num, beh_staat_dummy
*/

* Load the delivered BeH extract (replaces whatever is currently in memory)
use  "$data\a015784_beh", clear

* The raw education variable and the month of birth are removed up front;
* beh_ausbildung_imp takes over the name beh_ausbildung further below
drop beh_ausbildung geb_dat_month
compress

* Fix the random-number seed for the remainder of the run
set seed 24112020

* Spells without an establishment ID are discarded. The IDs 1 and 2 must not
* occur in the data because they will later be used to assign non-employment.
drop if missing(betnr)
assert !inlist(betnr,1,2)

* Overwrite berufstellg with the imputed value wherever the imputation is 20 or 21
* (inlist() is false for a missing imputed value, so those spells stay untouched)
replace berufstellg = berufstellg_imp_num if inlist(berufstellg_imp_num,20,21)
drop berufstellg_imp_num

* Rename the delivered variables to the short names used in the rest of the script
* (old names in the first list, new names in the same position of the second list)
rename (prs_id beh_beg_dat beh_end_dat beh_tag_entg beh_ausbildung_imp) ///
       (persnr begorig     endorig     tentgelt     beh_ausbildung)

* (Re-)label variables: identifiers
label variable persnr        "Person ID"
label variable betnr         "Establishment ID"

* (Re-)label variables: spell dates and notification information
label variable begorig       "Begin of spell"
label variable endorig       "End of spell"
label variable beh_abg       "Reason for submission"

* (Re-)label variables: job and person characteristics
label variable berufstellg   "Occupational classification"
label variable beh_pers_gr   "Group of persons"
label variable tentgelt      "Daily wage"
label variable sex_id        "Sex"
label variable geb_dat_year  "Year of birth"

* Inspect the occupational classification (including missings) before filtering
tab berufstellg, mis

* Remove spells that are neither full- nor part-time ("Heimarbeiter",
* "Berufstellung 5" and "Berufstellung 6", i.e. codes 5 to 7) together with
* spells whose occupational classification is missing (system missing or one
* of the codes -9, -7, -5)
drop if inrange(berufstellg,5,7) | inlist(berufstellg,-9,-7,-5) | missing(berufstellg)

* Part-time (pt) indicator; trainees are counted as part-time workers.
* A spell is flagged when berufstellg is 0, 8, 9 or 21, or when the person
* group is 102 or 141.
tab beh_pers_gr if berufstellg == 0
gen byte pt = inlist(berufstellg,0,8,9,21) | inlist(beh_pers_gr,102,141)
label variable pt "Part-time dummy"
tab pt, mis

* Calendar year of the spell, taken from the spell's end date;
* spells for which no year can be derived are removed
gen year = year(endorig)
label variable year "Year"
drop if missing(year)

* Spell duration in days, counting both the first and the last day
gen spell_length = endorig - begorig + 1
label variable spell_length "Spell length in days"

* Every spell must last between 1 and 366 days; a missing length also fails
assert inrange(spell_length,1,366)

compress

* Fold the earnings of spells with beh_abg == 54 into the other spells of the
* same person, establishment and year. The total earnings of the 54 spells
* (daily wage x spell length) are split across the remaining spells in
* proportion to their length, and the daily wage of those spells is recomputed.
* The 54 spells themselves are removed afterwards.
* If a person-establishment-year cell contains only 54 spells there is no spell
* left to receive the amount, so it is lost when the 54 spells are dropped.
* All monetary intermediates are stored as double: the default float type keeps
* only about 7 significant digits, which is not enough for yearly sums in cents.

* Wage distribution before the adjustment (log output only)
su tentgelt if beh_abg != 54

* is54 marks the 54 spells, has54 marks every spell of a cell containing one
gen byte is54 = (beh_abg == 54)
bysort persnr betnr year: egen byte has54 = max(is54)

* Total earnings per spell
gen double entgelt = spell_length * tentgelt

* Total length of the non-54 spells in the cell (missing on the 54 spells)
bysort persnr betnr year: egen double len_regular = total(spell_length) if !is54

* Total earnings of the 54 spells in the cell (0 when the cell has none)
bysort persnr betnr year: egen double entgelt54 = total(cond(is54, entgelt, .))

* Add each regular spell's length-proportional share of the 54 earnings
replace entgelt = entgelt + (spell_length / len_regular) * entgelt54 if has54 == 1 & !is54

* Remove the 54 spells and recompute the daily wage only where something was
* added, so that wages in unaffected cells remain exactly as delivered
drop if is54
replace tentgelt = entgelt / spell_length if has54 == 1

drop is54 has54 len_regular entgelt54 entgelt
drop beh_abg
compress

* Wage distribution after the adjustment (log output only)
su tentgelt

* Round the daily wage to two decimals
replace tentgelt = round(tentgelt,.01)

* Drop spells where daily wages are too small, using the marginal earning
* threshold (Geringfügigkeitsgrenze) converted to a daily amount (monthly / 30).
* Legal thresholds: April 1999 - April 2003: 325 €. April 2003 - Dec. 2012: 400 €.
* Since Jan. 2013: 450 €. The code applies them by calendar year:
*   1997-2002: 325 €, 2003-2012: 400 €, 2013-2019: 450 €.
* A spell is exempt from the rule only if the person group is 102 or 141 AND
* berufstellg equals 0.
* NOTE(review): the part-time flag treats either condition alone as sufficient,
* while the exemption here requires both - confirm that this is intended.
local first_year 1997 2003 2013
local last_year  2002 2012 2019
local limit_eur  325  400  450
forvalues k = 1/3 {
    local y0  : word `k' of `first_year'
    local y1  : word `k' of `last_year'
    local eur : word `k' of `limit_eur'
    drop if tentgelt <= (`eur'/30) & inrange(year,`y0',`y1') & !(inlist(beh_pers_gr,102,141) & berufstellg == 0)
}

* The occupational classification is not needed beyond this point
drop berufstellg

* Drop observations that are identical in every variable
duplicates drop

* Age = spell year minus year of birth; keep ages 15 to 66
* (inrange() is false for a missing age, so those spells are removed as well)
gen age = year - geb_dat_year
label variable age "Age in years"
drop geb_dat_year
keep if inrange(age,15,66)


* Spell ID: running number within person, ordered by spell begin, spell end and
* daily wage. The ordering is stated in the bysort itself so that it does not
* depend on a preceding sort surviving the by-group sort.
* Spells of one person that tie on all three keys are ordered arbitrarily,
* because Stata's sort is not stable.
bysort persnr (begorig endorig tentgelt): gen double spell = _n

* Place the establishment ID directly after the person ID
order betnr, after(persnr)

compress
save "$temp\BeH_large.dta", replace
